Reading

park_visits <- readr::read_csv(here::here("data/park-visits.csv"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   year = col_character(),
##   gnis_id = col_character(),
##   geometry = col_character(),
##   metadata = col_character(),
##   number_of_records = col_double(),
##   parkname = col_character(),
##   region = col_character(),
##   state = col_character(),
##   unit_code = col_character(),
##   unit_name = col_character(),
##   unit_type = col_character(),
##   visitors = col_double()
## )
gas_price <- readr::read_csv(here::here("data/gas-price.csv"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   year = col_double(),
##   gas_current = col_double(),
##   gas_constant = col_double()
## )
state_pop <- readr::read_csv(here::here("data/state-pop.csv"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   year = col_double(),
##   state = col_character(),
##   pop = col_double()
## )

Parks

Initial cleaning

visdat::vis_dat(park_visits)

naniar::miss_var_summary(park_visits)
## # A tibble: 12 x 3
##    variable          n_miss pct_miss
##    <chr>              <int>    <dbl>
##  1 metadata            2712  12.6   
##  2 parkname            2218  10.3   
##  3 visitors               4   0.0186
##  4 year                   0   0     
##  5 gnis_id                0   0     
##  6 geometry               0   0     
##  7 number_of_records      0   0     
##  8 region                 0   0     
##  9 state                  0   0     
## 10 unit_code              0   0     
## 11 unit_name              0   0     
## 12 unit_type              0   0
summary(park_visits)
##      year             gnis_id            geometry           metadata        
##  Length:21560       Length:21560       Length:21560       Length:21560      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  number_of_records   parkname            region             state          
##  Min.   :1         Length:21560       Length:21560       Length:21560      
##  1st Qu.:1         Class :character   Class :character   Class :character  
##  Median :1         Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1                                                                 
##  3rd Qu.:1                                                                 
##  Max.   :1                                                                 
##                                                                            
##   unit_code          unit_name          unit_type            visitors        
##  Length:21560       Length:21560       Length:21560       Min.   :        0  
##  Class :character   Class :character   Class :character   1st Qu.:    39125  
##  Mode  :character   Mode  :character   Mode  :character   Median :   155219  
##                                                           Mean   :  1277105  
##                                                           3rd Qu.:   608144  
##                                                           Max.   :871922828  
##                                                           NA's   :4
# are columns completely unique for their length?
vec_size(park_visits)
## [1] 21560
completely_unique <- function(x) vec_unique_count(x) == 1
prop_unique <- function(x) (vec_unique_count(x) / vec_size(x))

# is everything totally unique?
map_lgl(park_visits, completely_unique)
##              year           gnis_id          geometry          metadata 
##             FALSE             FALSE             FALSE             FALSE 
## number_of_records          parkname            region             state 
##              TRUE             FALSE             FALSE             FALSE 
##         unit_code         unit_name         unit_type          visitors 
##             FALSE             FALSE             FALSE             FALSE
map_lgl(park_visits, completely_unique) %>% any()
## [1] TRUE
map_lgl(park_visits, completely_unique) %>% which()
## number_of_records 
##                 5
# how unique are they?
map_dfr(park_visits, prop_unique) %>% 
  pivot_longer(cols = everything()) %>% 
  arrange(-value)
## # A tibble: 12 x 2
##    name                  value
##    <chr>                 <dbl>
##  1 visitors          0.877    
##  2 unit_name         0.0179   
##  3 gnis_id           0.0175   
##  4 unit_code         0.0175   
##  5 parkname          0.0156   
##  6 metadata          0.0153   
##  7 year              0.00529  
##  8 state             0.00250  
##  9 unit_type         0.00139  
## 10 region            0.000371 
## 11 geometry          0.0000928
## 12 number_of_records 0.0000464

Let’s count the number of visitors

count(park_visits, visitors) 
## # A tibble: 18,916 x 2
##    visitors     n
##       <dbl> <int>
##  1        0   180
##  2        5     2
##  3        7     2
##  4       10     2
##  5       12     2
##  6       14     2
##  7       15     2
##  8       17     2
##  9       19     4
## 10       20     1
## # … with 18,906 more rows

And how many different types of patterns do we have? We can count them twice

count(park_visits, visitors) %>% 
  count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
## # A tibble: 16 x 2
##        n    nn
##    <int> <int>
##  1     1 17419
##  2     2  1058
##  3     3   209
##  4     4   112
##  5     5    53
##  6     6    22
##  7     7    19
##  8     8     7
##  9     9     5
## 10    10     2
## 11    11     3
## 12    13     1
## 13    14     3
## 14    15     1
## 15    16     1
## 16   180     1

Lots of single visits, apparently?

What is in number of records?

n_distinct(park_visits$number_of_records)
## [1] 1
park_visits_tidy <- park_visits %>% 
  mutate(year = parse_number(year)) %>% 
  # doesn't contain anything useful
  select(-number_of_records) %>% 
  # I want the visitor number after year
  relocate(visitors, .after = year)
## Warning: Problem with `mutate()` input `year`.
## ℹ 386 parsing failures.
## row col expected actual
## 301  -- a number  Total
## 309  -- a number  Total
## 338  -- a number  Total
## 370  -- a number  Total
## 387  -- a number  Total
## ... ... ........ ......
## See problems(...) for more details.
## 
## ℹ Input `year` is `parse_number(year)`.
## Warning: 386 parsing failures.
## row col expected actual
## 301  -- a number  Total
## 309  -- a number  Total
## 338  -- a number  Total
## 370  -- a number  Total
## 387  -- a number  Total
## ... ... ........ ......
## See problems(...) for more details.
park_visits_tidy
## # A tibble: 21,560 x 11
##     year visitors gnis_id geometry metadata parkname region state unit_code
##    <dbl>    <dbl> <chr>   <chr>    <chr>    <chr>    <chr>  <chr> <chr>    
##  1  1904     1500 1163670 POLYGON  <NA>     Crater … PW     OR    CRLA     
##  2  1941        0 1531834 MULTIPO… <NA>     Lake Ro… PW     WA    LARO     
##  3  1961    69000 2055170 MULTIPO… <NA>     Lewis a… PW     WA    LEWI     
##  4  1935     2200 1530459 MULTIPO… <NA>     Olympic  PW     WA    OLYM     
##  5  1982   468144 277263  POLYGON  <NA>     Santa M… PW     CA    SAMO     
##  6  1919    64000 578853  MULTIPO… <NA>     <NA>     NE     ME    ACAD     
##  7  1969   448000 1329499 MULTIPO… <NA>     <NA>     IM     TX    AMIS     
##  8  1967   738700 589056  POLYGON  <NA>     <NA>     NE     MD    ASIS     
##  9  1944     1409 1377082 POLYGON  <NA>     <NA>     IM     TX    BIBE     
## 10  1989    81157 302659  POLYGON  <NA>     <NA>     SE     FL    BICY     
## # … with 21,550 more rows, and 2 more variables: unit_name <chr>,
## #   unit_type <chr>

Let’s define this as a time series tsibble. However, to do so, we’ll need to identify the index and key

What defines a park?

Are they the same length?

park_visits_tidy %>% 
  summarise_at(.vars = vars(parkname, unit_code, unit_name),
               .funs = n_distinct)
## # A tibble: 1 x 3
##   parkname unit_code unit_name
##      <int>     <int>     <int>
## 1      337       378       386

Ok let’s go unit_name to keep things simple, especially since parkname has a bunch of missings. I think technically a park could be in multiple states so we won’t add hierarchy here. We’ll also remove the geometry column since it creates issued with tsibble.

inspectdf::inspect_mem(park_visits_tidy) %>% inspectdf::show_plot()

Exploring visitor counts

So the visitor counts is the thing that I’m most interest in, I think.

ggplot(park_visits_tidy,
       aes(x = year,
           y = visitors)) + 
  geom_point()
## Warning: Removed 386 rows containing missing values (geom_point).

outliers?

ggplot(park_visits_tidy,
       aes(x = visitors)) + 
  geom_boxplot()
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Where do these occur? Let’s make the data a little bit smaller

park_visits_cut <- park_visits_tidy %>% 
  select(year, 
         visitors,
         region:unit_type)

park_visits_cut %>% 
  filter(visitors > 2.5e8)
## # A tibble: 8 x 7
##    year  visitors region state unit_code unit_name              unit_type       
##   <dbl>     <dbl> <chr>  <chr> <chr>     <chr>                  <chr>           
## 1    NA 871922828 NT     NC    BLRI      Blue Ridge Parkway     Parkway         
## 2    NA 521947058 SE     NC    GRSM      Great Smoky Mountains… National Park   
## 3    NA 330201962 NC     VA    GWMP      George Washington Mem… National Parkway
## 4    NA 411700377 PW     NV    LAKE      Lake Mead National Re… National Recrea…
## 5    NA 611031225 PW     CA    GOGA      Golden Gate National … National Recrea…
## 6    NA 329664174 NE     NY    GATE      Gateway National Recr… National Recrea…
## 7    NA 443145232 SE     MS    NATR      Natchez Trace Parkway  Parkway         
## 8    NA 282420671 NE     VA    COLO      Colonial National His… National Histor…

Do we get large numbers of visitors when there are missing values?

library(naniar)

miss_var_summary(park_visits_cut)
## # A tibble: 7 x 3
##   variable  n_miss pct_miss
##   <chr>      <int>    <dbl>
## 1 year         386   1.79  
## 2 visitors       4   0.0186
## 3 region         0   0     
## 4 state          0   0     
## 5 unit_code      0   0     
## 6 unit_name      0   0     
## 7 unit_type      0   0
gg_miss_upset(park_visits_cut)

park_visits_cut_nab <- nabular(park_visits_cut)

park_visits_cut_nab %>% 
  group_by(year_NA) %>% 
  summarise(max_visitors = max(visitors, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
##   year_NA max_visitors
##   <fct>          <dbl>
## 1 !NA         21767176
## 2 NA         871922828
ggplot(park_visits_cut_nab,
       aes(x = year_NA,
           y = visitors)) + 
  geom_boxplot()
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

OK so based on that I think I should remove the rows that contain missing values for year and visitor_NA.

park_visits_cut_na <- park_visits_cut %>% 
  drop_na()

OK now let’s add on data for the state population, and the gas price, to make things a bit more interesting. Let’s quickly

park_visits_ts <- park_visits_cut_na %>% 
  left_join(gas_price, by = "year") %>% 
  left_join(state_pop, by = c("year", "state")) %>% 
  as_tsibble(key = unit_name,
             index = year)

park_visits_ts
## # A tsibble: 21,174 x 10 [1Y]
## # Key:       unit_name [382]
##     year visitors region state unit_code unit_name unit_type gas_current
##    <dbl>    <dbl> <chr>  <chr> <chr>     <chr>     <chr>           <dbl>
##  1  1934   175000 SE     KY    ABLI      Abraham … National…        0.19
##  2  1935    89355 SE     KY    ABLI      Abraham … National…        0.19
##  3  1936   149665 SE     KY    ABLI      Abraham … National…        0.19
##  4  1937   111840 SE     KY    ABLI      Abraham … National…        0.2 
##  5  1938   121144 SE     KY    ABLI      Abraham … National…        0.2 
##  6  1939   112626 SE     KY    ABLI      Abraham … National…        0.19
##  7  1940   136945 SE     KY    ABLI      Abraham … National…        0.18
##  8  1941   110830 SE     KY    ABLI      Abraham … National…        0.19
##  9  1942    59210 SE     KY    ABLI      Abraham … National…        0.2 
## 10  1943    13520 SE     KY    ABLI      Abraham … National…        0.21
## # … with 21,164 more rows, and 2 more variables: gas_constant <dbl>, pop <dbl>

OK, so now we get some nice spaghetti.

gg_park_visits_spag <- ggplot(park_visits_ts,
       aes(x = year,
           y = visitors,
           group = unit_name)) + 
  geom_line()

Let’s use brolgar to help inspect that

library(brolgar)
gg_park_visits_spag + facet_strata(along = pop)

gg_park_visits_spag + facet_strata(along = gas_constant)

Let’s use keys_near to identify those parks near the five number summary:

park_visits_ts %>% 
  keys_near(visitors)
## # A tibble: 185 x 5
##    unit_name                                 visitors stat  stat_value stat_diff
##    <chr>                                        <dbl> <fct>      <dbl>     <dbl>
##  1 Aniakchak National Monument                      0 min            0         0
##  2 Aniakchak National Preserve                      0 min            0         0
##  3 Appomattox Court House National Historic…   148300 med       148300         0
##  4 Big Hole National Battlefield                    0 min            0         0
##  5 Big Hole National Battlefield                    0 min            0         0
##  6 Brices Cross Roads National Battlefield …        0 min            0         0
##  7 Brices Cross Roads National Battlefield …        0 min            0         0
##  8 Brices Cross Roads National Battlefield …        0 min            0         0
##  9 Brices Cross Roads National Battlefield …        0 min            0         0
## 10 Brices Cross Roads National Battlefield …        0 min            0         0
## # … with 175 more rows

huh, filter out the zeros?

park_visits_ts %>% 
  filter(visitors > 0) %>% 
  keys_near(visitors) 
## # A tibble: 10 x 5
##    unit_name                                 visitors stat  stat_value stat_diff
##    <chr>                                        <dbl> <fct>      <dbl>     <dbl>
##  1 Appomattox Court House National Historic…   152200 med      152207       7   
##  2 Arkansas Post National Memorial              39703 q_25      39703.      0.25
##  3 Bryce Canyon National Park                  579200 q_75     579200       0   
##  4 Denali National Park                             5 min           5       0   
##  5 Denali National Preserve                         5 min           5       0   
##  6 Everglades National Park                    579200 q_75     579200       0   
##  7 Golden Gate National Recreation Area      21767176 max    21767176       0   
##  8 Herbert Hoover National Historic Site       152214 med      152207       7   
##  9 Lincoln Boyhood National Memorial           152200 med      152207       7   
## 10 Lyndon B. Johnson National Historical Pa…   579200 q_75     579200       0
park_visits_ts_near <- park_visits_ts %>% 
  filter(visitors > 0) %>% 
  keys_near(visitors) %>% 
  select(-visitors) %>% 
  left_join(park_visits_ts, by = "unit_name")

park_visits_ts_near
## # A tibble: 664 x 13
##    unit_name stat  stat_value stat_diff  year visitors region state unit_code
##    <chr>     <fct>      <dbl>     <dbl> <dbl>    <dbl> <chr>  <chr> <chr>    
##  1 Appomatt… med       152207         7  1941    50000 NE     VA    APCO     
##  2 Appomatt… med       152207         7  1942     9750 NE     VA    APCO     
##  3 Appomatt… med       152207         7  1943     3125 NE     VA    APCO     
##  4 Appomatt… med       152207         7  1944     5450 NE     VA    APCO     
##  5 Appomatt… med       152207         7  1945     7850 NE     VA    APCO     
##  6 Appomatt… med       152207         7  1946    19400 NE     VA    APCO     
##  7 Appomatt… med       152207         7  1947    25250 NE     VA    APCO     
##  8 Appomatt… med       152207         7  1948    27750 NE     VA    APCO     
##  9 Appomatt… med       152207         7  1949    34150 NE     VA    APCO     
## 10 Appomatt… med       152207         7  1950    55400 NE     VA    APCO     
## # … with 654 more rows, and 4 more variables: unit_type <chr>,
## #   gas_current <dbl>, gas_constant <dbl>, pop <dbl>

plot these

library(stickylabeller)
ggplot(park_visits_ts_near,
       aes(x = year,
           y = visitors,
           colour = stat)) + 
  geom_line() +
  facet_wrap(~stat + unit_name, 
             scales = "free_y",
             labeller = label_glue("Park: {unit_name} \nNearest to {stat}")) + 
  theme(legend.position = "none")

Now plot them by the slope of year, another way of looking at it

park_visits_ts_lm_near <- park_visits_ts %>% 
  filter(visitors > 0) %>% 
  key_slope(visitors ~ year) %>% 
  keys_near(key = unit_name,
            var = .slope_year,
            top_n = 6) %>% 
  left_join(park_visits_ts, by = "unit_name") 

library(stickylabeller)

ggplot(park_visits_ts_lm_near,
       aes(x = year,
           y = visitors,
           colour = stat)) + 
  geom_smooth(method = "lm", se = FALSE,
              colour = "grey50",
              alpha = 0.5,
              size = 0.5) +
  geom_line() +
  facet_wrap(~stat + unit_name, 
             scales = "free_y",
             labeller = label_glue("Park: {unit_name} \nNearest to {stat} slope"),
             nrow = 10) + 
  theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'

Now can we add in the other variables, population size and gas to see if there’s something interesting there?

park_visits_lm_gas_pop <- park_visits_ts %>% 
  drop_na(pop) %>% 
  filter(visitors > 0) %>% 
  key_slope(visitors ~ year + pop + gas_constant)

park_visits_ts_lm_pop_near <- park_visits_lm_gas_pop %>% 
  keys_near(key = unit_name,
            var = .slope_pop,
            top_n = 6) %>% 
  left_join(park_visits_ts, by = "unit_name") 

library(stickylabeller)

ggplot(park_visits_ts_lm_near,
       aes(x = year,
           y = visitors,
           colour = stat)) + 
  geom_smooth(method = "lm", se = FALSE,
              colour = "grey50",
              alpha = 0.5,
              size = 0.5) +
  geom_line() +
  facet_wrap(~stat + unit_name, 
             scales = "free_y",
             labeller = label_glue("Park: {unit_name} \nNearest to {stat} slope"),
             nrow = 10) + 
  theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'

Are some of them always increasing?

Gas price

summary(gas_price)
##       year       gas_current      gas_constant  
##  Min.   :1929   Min.   :0.1700   Min.   :1.470  
##  1st Qu.:1950   1st Qu.:0.2700   1st Qu.:1.830  
##  Median :1972   Median :0.3600   Median :2.040  
##  Mean   :1972   Mean   :0.9001   Mean   :2.202  
##  3rd Qu.:1994   3rd Qu.:1.1650   3rd Qu.:2.470  
##  Max.   :2015   Max.   :3.6400   Max.   :3.800
ggplot(gas_price,
       aes(x = year,
           y = gas_constant)) + 
  geom_line()

State Pop

state_pop
## # A tibble: 5,916 x 3
##     year state     pop
##    <dbl> <chr>   <dbl>
##  1  1900 AL    1830000
##  2  1901 AL    1907000
##  3  1902 AL    1935000
##  4  1903 AL    1957000
##  5  1904 AL    1978000
##  6  1905 AL    2012000
##  7  1906 AL    2045000
##  8  1907 AL    2058000
##  9  1908 AL    2070000
## 10  1909 AL    2108000
## # … with 5,906 more rows

Reproducibility

Reproducibility receipt
## datetime
Sys.time()
## [1] "2020-10-19 10:52:58 AEDT"
## repository
if(requireNamespace('git2r', quietly = TRUE)) {
  git2r::repository()
} else {
  c(
    system2("git", args = c("log", "--name-status", "-1"), stdout = TRUE),
    system2("git", args = c("remote", "-v"), stdout = TRUE)
  )
}
## Local:    /Users/ntie0001/github/njtierney/tidy-tue-npv
## Head:     nothing commited (yet)
## session info
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] stickylabeller_0.0.0.9100 brolgar_0.0.6.9100       
##  [3] naniar_0.6.0.9000         tsibble_0.9.3.9000       
##  [5] vctrs_0.3.4               forcats_0.5.0            
##  [7] stringr_1.4.0             dplyr_1.0.2              
##  [9] purrr_0.3.4               readr_1.4.0              
## [11] tidyr_1.1.2               tibble_3.0.4             
## [13] ggplot2_3.3.2             tidyverse_1.3.0          
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2           splines_4.0.2        jsonlite_1.7.1      
##  [4] here_0.1             modelr_0.1.8         assertthat_0.2.1    
##  [7] distributional_0.2.0 askpass_1.1          conflicted_1.0.4    
## [10] blob_1.2.1           fabletools_0.2.1     cellranger_1.1.0    
## [13] yaml_2.2.1           progress_1.2.2       lattice_0.20-41     
## [16] pillar_1.4.6         backports_1.1.10     glue_1.4.2          
## [19] visdat_0.5.3         digest_0.6.26        rvest_0.3.6         
## [22] colorspace_1.4-1     Matrix_1.2-18        htmltools_0.5.0     
## [25] plyr_1.8.6           pkgconfig_2.0.3      broom_0.7.0         
## [28] haven_2.3.1          scales_1.1.1         git2r_0.27.1        
## [31] openssl_1.4.3        mgcv_1.8-33          generics_0.0.2      
## [34] farver_2.0.3         ellipsis_0.3.1       UpSetR_1.4.0        
## [37] withr_2.3.0          inspectdf_0.0.9      credentials_1.3.0   
## [40] cli_2.1.0            magrittr_1.5         crayon_1.3.4        
## [43] readxl_1.3.1         memoise_1.1.0        evaluate_0.14       
## [46] fs_1.5.0             fansi_0.4.1          nlme_3.1-149        
## [49] anytime_0.3.9        xml2_1.3.2           tools_4.0.2         
## [52] prettyunits_1.1.1    hms_0.5.3            lifecycle_0.2.0     
## [55] munsell_0.5.0        reprex_0.3.0         compiler_4.0.2      
## [58] rlang_0.4.8          grid_4.0.2           rstudioapi_0.11     
## [61] sys_3.4              labeling_0.3         rmarkdown_2.3.8     
## [64] gtable_0.3.0         DBI_1.1.0            R6_2.4.1            
## [67] gridExtra_2.3        lubridate_1.7.9      knitr_1.30          
## [70] utf8_1.1.4           rprojroot_1.3-2      stringi_1.5.3       
## [73] Rcpp_1.0.5           ggfittext_0.9.0      dbplyr_1.4.4        
## [76] tidyselect_1.1.0     xfun_0.18